/* 
    Purpose: Using the 2000 Census, this file locates black and white men aged 
             30-50 who are fathers of a child younger than 18 in the same household. 
             Other variables necessary to create average predicted father income 
             are also cleaned, and then average predicted father income 
             (i.e., "income scores") is calculated at the preferred 
             occupation x race x south level. 

    Note: Income was asked of all individuals 15+ in the 2000 Census, 
          so there's no need to use a sample line weight. 

    Creates: incomescores_fathers2000_byrace_bysouth.dta
*/
* Start from an empty session and work out of the Census data folder.
set more off
clear
cd "$Mydirectory1/1_DataSources/CensusData/"

* Raw 1% extract (download from IPUMS USA); only the 2000 sample is used in this file.
use ./input/Census_1910to2010_1pct_raw.dta, clear
    drop if year != 2000
    * Person weights differ across respondents, so every collapse below must be weighted.
    summarize perwt

*------------------------------------------------------------------------------------*
*------------------------------------------------------------------------------------*

***************************
** SET UP INCOME VARIABLES
***************************

* Fix income variables
    * 9999999 is recoded to missing (presumably the IPUMS N/A code -- confirm
    * against the codebook); negative incomes are floored at zero.
    summarize inctot, detail
    replace inctot = . if inctot == 9999999
    replace inctot = 0 if inctot < 0

    * Store as double up front so the deflated (non-integer) values below are
    * held at full precision.
    recast double inctot

* Household income
    * Sum of cleaned personal income over everyone sharing a household serial.
    * total() treats missing as zero, so no zero-filled helper variable is
    * needed; "double" avoids float rounding of large household totals.
    bysort serial: egen double hh_income = total(inctot)

/*
   Convert income variables to 1950 dollars using the CPI:
   Source: https://www.minneapolisfed.org/about-us/monetary-policy/inflation-calculator/consumer-price-index-1913-
*/
    * Scalars are held in double precision; storing the CPI values in
    * default-float variables would round them and slightly bias the deflator.
    scalar cpi_2000 = 172.2
    scalar cpi_1950 = 24.1

    foreach var of varlist inctot hh_income {
        replace `var' = `var' * (scalar(cpi_1950) / scalar(cpi_2000))
    }

    * Snapshot of the full 2000 person file; reloaded later to pick out fathers.
    tempfile fulldata
    save `fulldata'

*------------------------------------------------------------------------------------*
*------------------------------------------------------------------------------------*

***************************
** IDENTIFY FATHERS
***************************
    
    * Children younger than 18 who have a father in the house
    * (poploc equal to 0 or missing means there is no father pointer).
    keep if age < 18 & !inlist(poploc, 0, .)
    keep serial poploc

    * One row per father id; a father with several children appears only once.
    bysort serial poploc: keep if _n == 1
    rename poploc pernum

    tempfile children
    save `children'


* Pull the fathers' own person records out of the full file
    use `fulldata', clear
    merge 1:1 serial pernum using `children', keep(match) nogenerate

* Restrict to black and white fathers aged 30 to 50
    keep if inrange(age, 30, 50) & inlist(race, 1, 2)
    
*------------------------------------------------------------------------------------*
*------------------------------------------------------------------------------------*

/* Note: Per Census documentation, OCCSCORE is a constructed 
         2-digit numeric variable that assigns occupational 
         income scores to each occupation. OCCSCORE represents 
         the median total income (in hundreds of 1950 dollars) 
         of all persons with that particular occupation in 1950. 
*/
    * Rescale OCCSCORE from hundreds of dollars to dollars.
    replace occscore = 100 * occscore

    tab statefip, m
    rename statefip fips

* Region of current residence
    * Keep the original region variable under a new name and rebuild a
    * four-category region from state FIPS codes.
    rename region region_og

    * 1 = Northeast: CT, ME, MA, NH, RI, VT, NJ, NY, PA
    * 2 = Midwest:   IL, IN, MI, OH, WI, IA, KS, MN, MO, NE, ND, SD
    * 3 = South:     DE, DC, FL, GA, MD, NC, SC, VA, WV, AL, KY, MS, TN, AR, LA, OK, TX
    * 4 = West:      AZ, CO, ID, MT, NV, NM, UT, WY, CA, OR, WA, plus AK and HI
    *                (note: Census puts AK and HI in with the Pacific division)
    gen region = .
    replace region = 1 if inlist(fips, 9, 23, 25, 33, 44, 50, 34, 36, 42)
    replace region = 2 if inlist(fips, 17, 18, 26, 39, 55, 19, 20, 27, 29, 31, 38, 46)
    replace region = 3 if inlist(fips, 10, 11, 12, 13, 24, 37, 45, 51, 54, 1, 21, 28, 47, 5, 22, 40, 48)
    replace region = 4 if inlist(fips, 4, 8, 16, 30, 32, 35, 49, 56, 6, 41, 53, 2, 15)

    * Any state left without a region shows up here.
    tab region fips if region == ., m

    * South indicator used as a cell dimension / merge key downstream.
    gen south_merge = (region == 3)
    tab region_og south_merge
    
*------------------------------------------------------------------------------------*
*------------------------------------------------------------------------------------*

****************
**** ASSIGN COARSENED OCCS
****************

* Prepare the occupation code
    * Codes of 980 and above are set to missing.
    sort occ1950
    replace occ1950 = . if occ1950 >= 980

* How many distinct Census occupation codes appear in the 2000 data?
    * nvals flags the first row within each occ1950 value.
    bysort occ1950: gen nvals = (_n == 1)
    count if nvals == 1

* Split the 200-290 occupations by self-employment
    * Self-employed workers (classwkr==1) in that range are shifted by 1000 so
    * they get their own codes.
    replace occ1950 = occ1950 + 1000 if inrange(occ1950, 200, 290) & classwkr == 1

* Map Census occupations to coarsened ANES occupations
    merge m:1 occ1950 using ../Crosswalks/Crosswalk_1950Census_toANES.dta
    * Every person record must find a crosswalk row ...
    assert _merge != 1
    * ... and crosswalk rows that no person uses are discarded.
    keep if _merge != 2
    drop _merge

    * Only records with a missing Census code may lack a coarsened code; drop them.
    assert occ1950 == . if occ1950ej == .
    drop if occ1950ej == .
    
*------------------------------------------------------------------------------------*
*------------------------------------------------------------------------------------*

*******************
*** TEMPLATES
*******************

    * Counter summed in the collapse below to give the raw number of fathers per cell.
    gen number=1

* Template 1: occupation x race
*   One row for every coarsened occupation present in the data (plus private
*   household workers, occ1950ej==65) crossed with race 1 and race 2.
preserve

    collapse (min) race, by(occ1950ej)

    * Ensure private household workers are in the template. The row is added
    * only when the code is absent, and is located by position-independent
    * logic rather than a hard-coded row number.
    count if occ1950ej==65
    if r(N)==0 {
        expand 2 in 1
        replace occ1950ej=65 if _n==_N
    }

    * Two rows per occupation, explicitly numbered race 1 and race 2. Assigning
    * both values (rather than only overwriting the second copy) keeps the
    * template correct even if an occupation contains fathers of one race only.
    expand 2
    bysort occ1950ej: replace race=_n

    tempfile occbyrace
    save `occbyrace'

restore

* Template 2: race x south
*   One row for every south_merge value crossed with race 1 and race 2.
preserve

    collapse (min) race, by(south_merge)
    expand 2
    bysort south_merge: replace race=_n

    tempfile south
    save `south'

restore

* Combine templates 1 & 2 into the full occupation x race x south grid
preserve

    use `occbyrace', clear
    joinby race using `south'

    tempfile template
    save `template'

restore

*******************
*** COLLAPSE 
*******************

    * Cell level = coarsened occupation x race x south.
    * number is an unweighted count of fathers; the two income measures are
    * perwt-weighted means.
    collapse (rawsum) number (mean) inctot hh_income [aweight = perwt], ///
        by(occ1950ej race south_merge)

    tempfile income
    save `income'

* Attach the collapsed income scores to the full template grid
    * Template cells with no fathers in the data are kept, with missing values.
    use `template', clear
    merge 1:1 occ1950ej race south_merge using `income', nogenerate

****************
*** IMPUTATIONS
****************

    * List the template cells that have no income score.
    bysort south_merge: tab race occ1950ej if inctot==.

/* Give black farmers (race==2, occ=81) in the non-south the average
   predicted income of black farmers in the south.
   NOTE(review): an earlier version of this comment described the opposite
   direction; the code has always copied the south value into the non-south
   cell, and that behavior is kept here. */
    foreach var of varlist inctot hh_income {
        summarize `var' if race==2 & occ1950ej==81 & south_merge==1
        * Hold the donor value in a scalar (full double precision) and stop
        * with a clear error if the donor cell is itself empty.
        scalar donor_mean = r(mean)
        assert !missing(scalar(donor_mean))
        replace `var' = scalar(donor_mean) if race==2 & occ1950ej==81 & south_merge==0
    }

/* Give private household workers (occ=65) the average predicted
   income of other service workers (occ=68) of the same race and
   in the same region (south v nonsouth) */
    foreach i in 1 2 {
        foreach j in 0 1 {
            foreach var of varlist inctot hh_income {
                summarize `var' if race==`i' & south_merge==`j' & occ1950ej==68
                scalar donor_mean = r(mean)
                assert !missing(scalar(donor_mean))
                replace `var' = scalar(donor_mean) if race==`i' & south_merge==`j' & occ1950ej==65
            }
        }
    }

    * After imputation every cell must have a personal income score.
    assert inctot!=.
*------------------------------------------------------------------------------------*
*------------------------------------------------------------------------------------*
****************
**** SAVE 
****************
    
    * Describe the cell-level variables before giving them their final names.
    label variable race "Respondent race (2000 Census)"
    label variable number "Number of obs in occ by race by south cell"
    label variable inctot "Coarse income score, mean, 2000, personal income, by race and south"
    label variable hh_income "Coarse income score, mean, 2000, household income, by race and south"

    * Final names: cell key, cell size, then the two income scores.
    rename occ1950ej fatheroccej
    rename number number_2000obs_byocc_byr_bys
    rename inctot avg_inctot_2000_byocc_byr_bys
    rename hh_income avg_HHinc_2000_byocc_byr_bys

    * Write the occupation x race x south income-score file.
    save ./output/incomescores_fathers2000_byrace_bysouth.dta, replace
